#Data Cleaning-part1
library(dplyr)
library(tidyverse)
library(rsample)
# Load the wine reviews and derive a numeric vintage year from each title.
wine <- read_csv(file = "winemag-data-130k-v2.csv")
# Pull the first standalone four-digit year (19xx or 20xx) out of the title.
# Stripping every letter and punctuation mark instead leaves *all* digit runs
# behind, so a title containing a second number (e.g. "... 2012 ... 10 Year")
# would turn into "2012 10" and be coerced to NA.
y <- str_extract(wine$title, "\\b(19|20)[0-9]{2}\\b")
# title1 holds the vintage as a number; it is NA when the title has no year.
wine1 <- wine %>%
  mutate(title1 = as.numeric(y))
names(wine1)
## [1] "X1" "country"
## [3] "description" "designation"
## [5] "points" "price"
## [7] "province" "region_1"
## [9] "region_2" "taster_name"
## [11] "taster_twitter_handle" "title"
## [13] "variety" "winery"
## [15] "title1"
# Subsets of reviews whose description contains each flavour keyword.
# The match is a case-sensitive substring match, so "fruit" also matches
# "fruity"/"fruits", and "earth" also matches "earthy".
Winefruit <- dplyr::filter(wine, grepl("fruit", description))
Wineearthy <- dplyr::filter(wine, grepl("earth", description))
Winefloral <- dplyr::filter(wine, grepl("floral", description))
# Derive the counts from the subsets instead of hard-coding them, so the chart
# stays correct if the input file changes. The values were previously typed in
# as 55547 / 8337 / 3537 -- presumably the row counts from an earlier run.
Count <- c(nrow(Winefruit), nrow(Wineearthy), nrow(Winefloral))
Type <- as.factor(c("Fruity", "Earthy", "Floral"))
Typewine <- data.frame(Type, Count)
# Bar chart of how many reviews mention each flavour keyword.
ggplot(Typewine, aes(Type)) +
  geom_col(mapping = aes(y = Count), fill = "darkred")
# Price and rating-point relationship by variety
#
# We start by finding out which varieties of wine are included in the dataset.
# We then draw two bar charts showing the average price and the average rating
# points of the top 10 varieties. Some patterns are visible: for example,
# Merlot, Rosé and Sauvignon Blanc have a relatively low average price per
# bottle and a relatively low average rating, while Pinot Noir and, to a lesser
# extent, Bordeaux-style Red Blend have a relatively high average price per
# bottle and a relatively high average rating.
#
# We therefore explore the correlation between price and points further with
# the scatter plot below. For efficiency, we use only the top 6 varieties this
# time: having checked the number of records for each variety, the 6th variety
# in descending order is Riesling, with about 5,189 records, which is a decent
# amount for a dataset of about 130,000 records. From the scatter plot, and
# especially from the fitted trend line, we can see that there is a positive
# correlation between price and points.
# Number of reviews per variety, most-reviewed variety first.
# The tally is stored in the `number` column.
winetop <- dplyr::count(wine, variety, name = "number", sort = TRUE)
# Horizontal bar chart: mean bottle price for ten selected varieties,
# ordered by that mean. Rows with a missing price are excluded first.
wine %>%
  filter(
    variety %in% c(
      "Pinot Noir", "Chardonnay", "Cabernet Sauvignon", "Red Blend",
      "Bordeaux-style Red Blend", "Riesling", "Sauvignon Blanc",
      "Syrah", "Rosé", "Merlot"
    ),
    !is.na(price)
  ) %>%
  group_by(variety) %>%
  summarize(avg_price_by_variety = mean(price)) %>%
  ggplot(aes(
    x = reorder(variety, avg_price_by_variety),
    y = avg_price_by_variety
  )) +
  geom_col(fill = "darkred") +
  scale_y_continuous(name = "Average Price ($)", breaks = seq(0, 60, by = 10)) +
  labs(title = "Average Price per Bottle", subtitle = "Top 10 Varieties") +
  # Flip so the variety names read horizontally along the left edge.
  coord_flip() +
  theme(
    panel.grid = element_blank(),
    panel.background = element_blank(),
    axis.title.y = element_blank()
  )
# Horizontal bar chart: mean rating points for ten selected varieties,
# ordered by that mean. Rows with a missing rating are excluded first.
wine %>%
  filter(
    variety %in% c(
      "Pinot Noir", "Chardonnay", "Cabernet Sauvignon", "Red Blend",
      "Bordeaux-style Red Blend", "Riesling", "Sauvignon Blanc",
      "Syrah", "Rosé", "Merlot"
    ),
    !is.na(points)
  ) %>%
  group_by(variety) %>%
  summarize(avg_point_by_variety = mean(points)) %>%
  ggplot(aes(
    x = reorder(variety, avg_point_by_variety),
    y = avg_point_by_variety
  )) +
  geom_col(fill = "darkblue") +
  scale_y_continuous(name = "Average Rating", breaks = seq(0, 100, by = 20)) +
  labs(title = "Average Rating per Bottle", subtitle = "Top 10 Varieties") +
  # Flip so the variety names read horizontally along the left edge.
  coord_flip() +
  theme(
    panel.grid = element_blank(),
    panel.background = element_blank(),
    axis.title.y = element_blank()
  )
#Correlation between Price and Rating of wine.
# Scatter plot of price against rating for a random sample of reviews from
# six selected varieties, with a single smoothed trend line over all of them.
winefilter <- wine %>%
  filter(variety %in% c(
    "Pinot Noir", "Chardonnay", "Cabernet Sauvignon",
    "Red Blend", "Bordeaux-style Red Blend", "Riesling"
  ))
# Sample only from rows that can actually be drawn: rows with a missing price
# or rating would otherwise be picked and then dropped by ggplot with a
# warning, leaving fewer points on the chart than were sampled.
plottable <- winefilter %>%
  filter(!is.na(price), !is.na(points))
# Cap the sample size so the draw cannot fail when fewer than 250 rows remain.
# The draw is not seeded, so the figure differs between runs; call set.seed()
# beforehand if a reproducible figure is needed.
# NOTE: the name `sample` shadows base::sample for variable lookups; it is kept
# so that any later code referring to `sample` continues to work.
sample <- plottable[sample.int(nrow(plottable), min(250, nrow(plottable))), ]
ggplot(sample) +
  geom_point(
    mapping = aes(x = points, y = price, color = variety),
    alpha = 0.5
  ) +
  # No colour mapping here, so one trend line is fitted across all varieties.
  geom_smooth(mapping = aes(x = points, y = price), se = FALSE) +
  theme_classic() +
  ylab("Price") +
  xlab("Ratings") +
  # ylim() removes points priced above 300 before the smoother is fitted.
  ylim(0, 300)
# Animation of price and rating over time (by country)
# This animation provides two insights. The first is that aged wine mainly
# comes from Italy, France and Germany, which are represented by the more
# greenish dots, while new wine mostly comes from the U.S. and Spain, which are
# represented by the more reddish dots. The second is that the price range of
# aged wine is larger than that of new wine.
library(plotly)
library(ggplot2)
# Create the animated price-vs-rating scatter, one frame per vintage year.
# Keep only vintages between 1994 and 2017; rows whose vintage is NA are
# dropped by filter() as well.
wine1 <- wine1 %>%
  filter(title1 >= 1994 & title1 <= 2017)
# Static ggplot. `frame` is not a ggplot2 aesthetic (ggplot2 warns that it is
# ignored); ggplotly() reads it to split the points into animation frames.
p <- ggplot(wine1, aes(x = points, y = price, color = country)) +
  geom_point(aes(frame = title1), alpha = 0.6) +
  ggtitle("Price vs Rating Over Times",
    subtitle = "Wine Vintage Throughout the World"
  ) +
  xlim(75, 100) +
  ylim(0, 400) +
  theme_classic() +
  labs(x = "Rating", y = "Price", color = "Country")
# Convert to a plotly object first, then configure the animation.
# animation_opts() and animation_slider() are documented as taking a plotly
# object; the slider was previously applied to the ggplot object before
# ggplotly() was called, so the height/width conversion happened too late.
wine2 <- ggplotly(p, height = 600, width = 500) %>%
  animation_opts(
    easing = "linear",
    redraw = FALSE
  ) %>%
  animation_slider(
    currentvalue = list(prefix = "YEAR ", font = list(color = "red"))
  )
wine2